import time
import glob
import os
import pandas as pd
import joblib
import random
from constants import SAMPLING_RATE, STORAGE_DIR, DATA_FILENAME, IS_PARALLEL, FREQUENCY, FEATURE_NAMES, TOTAL_VINS
from preprocessing import preprocess
from feature_engineering import feature_engr_sync, feature_engr_parallel
import multiprocessing as mp
from joblib import Parallel, delayed
from sklearn.preprocessing import MinMaxScaler

def apply_parallel_gen(dfGrouped, func):
    """Apply ``func`` to every group of ``dfGrouped`` in parallel and report failures.

    Args:
        dfGrouped: Iterable yielding ``(name, group)`` pairs, e.g. a pandas
            ``groupby`` object. Only ``group`` is handed to ``func``.
        func: Callable invoked once per group. Its results are summed to
            count successes, so it presumably returns 1/True on success and
            0/False on failure -- confirm against the callee.

    Prints the number of groups processed and how many were counted as failed.
    Nothing is returned.
    """
    # One worker per CPU reported by multiprocessing.
    pool = Parallel(n_jobs=mp.cpu_count())
    outcomes = pool(delayed(func)(group) for _, group in dfGrouped)
    total = len(outcomes)
    failed = total - sum(outcomes)
    print(f" Total VINS: {total} \n Total failed VINS: {failed}")

def apply_parallel_iter(value_list, func):
    """Apply ``func`` to every item of ``value_list`` in parallel and report failures.

    Args:
        value_list: Iterable of values; each one is passed to ``func`` as its
            single argument.
        func: Callable invoked once per value. Its results are summed to
            count successes, so it presumably returns 1/True on success and
            0/False on failure -- confirm against the callee.

    Prints the number of items processed and how many were counted as failed.
    Nothing is returned.
    """
    # One worker per CPU reported by multiprocessing.
    pool = Parallel(n_jobs=mp.cpu_count())
    outcomes = pool(delayed(func)(item) for item in value_list)
    total = len(outcomes)
    failed = total - sum(outcomes)
    print(f" Total VINS: {total} \n Total failed VINS: {failed}")

if __name__ == "__main__":
    # Pipeline driver: per-VIN preprocessing, then feature engineering, then
    # persisting a MinMaxScaler alongside the feature files.

    # Create base directories to store files
    OG_start_time = start_time = time.time()
    folder_path_preprocess = os.path.join(STORAGE_DIR, SAMPLING_RATE)
    folder_path_features = os.path.join(STORAGE_DIR, FREQUENCY)

    # exist_ok=True avoids the check-then-create race of os.path.exists().
    os.makedirs(folder_path_preprocess, exist_ok=True)
    os.makedirs(folder_path_features, exist_ok=True)

    df = pd.read_parquet(DATA_FILENAME, engine='fastparquet')  # read dataset

    # Keep only the TOTAL_VINS VINs that have the most rows in the dataset.
    selected_vins = df.groupby('VIN').size().sort_values(ascending=False).reset_index()["VIN"].tolist()[0:TOTAL_VINS]
    df = df[df['VIN'].isin(selected_vins)]

    grouped = df.groupby('VIN')

    # Stage 1: preprocess each VIN's rows, either in parallel or one by one.
    if IS_PARALLEL:
        print("Running in parallel")
        apply_parallel_gen(grouped, preprocess)
    else:
        print("Parallel processing disabled")
        for vin, vin_frame in grouped:
            print(vin)
            preprocess(vin_frame)

    print("Time taken for execution of Preprocessing --- %s mins ---" % ((time.time() - start_time)/60.0))
    start_time = time.time()
    # Presumably these CSVs are written by preprocess(); sorted for a stable order.
    preprocessed_csvfiles = sorted(glob.glob(os.path.join(folder_path_preprocess, '*.csv')))

    # Stage 2: feature engineering and scaler fitting.
    if IS_PARALLEL:
        apply_parallel_iter(preprocessed_csvfiles, feature_engr_parallel)
        print("Time taken for execution of Feature Engineering --- %s mins ---" % ((time.time() - start_time)/60.0))
        start_time = time.time()
        # Workers cannot share one scaler object, so fit it incrementally
        # afterwards from the feature CSVs found in the features folder.
        scaler = MinMaxScaler()
        for file_name in sorted(glob.glob(os.path.join(folder_path_features, '*.csv'))):
            feat = pd.read_csv(file_name)
            scaler.partial_fit(feat.loc[:, FEATURE_NAMES])
        print("Time taken for execution of Scaler MinMax() processing --- %s mins ---" % ((time.time() - start_time)/60.0))
    else:
        # NOTE(review): feature_engr_sync presumably updates the scaler in
        # place; its return value is not used here.
        scaler = MinMaxScaler()
        for _file in preprocessed_csvfiles:
            feature_engr_sync(_file, scaler)
        print("Time taken for execution of Feature Engineering --- %s mins ---" % ((time.time() - start_time)/60.0))

    # Persist the scaler next to the feature files.
    scaler_file = os.path.join(folder_path_features, 'scaler.save')
    joblib.dump(scaler, scaler_file)  # save scaler

    print("Time taken for execution --- %s mins ---" % ((time.time() - OG_start_time)/60.0))
    
